#!/usr/bin/env python

import nltk
import urllib2
from nltk import word_tokenize

# Download the article and decode it to unicode text.
url = 'http://news.bbc.co.uk/2/hi/health/2284783.stm'
response = urllib2.urlopen(url)
try:
    # The payload is decoded as UTF-8; a page served in another encoding
    # would raise UnicodeDecodeError here.
    html = response.read().decode('utf8')
finally:
    # urllib2 responses are not context managers in Python 2, so release the
    # connection explicitly even when the read or decode fails.
    response.close()

# sudo pip install beautifulsoup4
from bs4 import BeautifulSoup
# Strip the markup and keep only the visible text. The parser is named
# explicitly: without it bs4 emits a warning and uses whichever parser happens
# to be installed, so the extracted text could differ between machines.
raw = BeautifulSoup(html, 'html.parser').get_text()
tokens = word_tokenize(raw)
# Preview the first 100 tokens. A single parenthesised argument prints the
# same under the Python 2 print statement and is also valid Python 3 syntax.
print(tokens[:100])

# Keep only tokens 110..389 before building the Text object.
# NOTE(review): these offsets are hard-coded, presumably to isolate the
# article body for this particular page -- confirm if the page or parser changes.
tokens = tokens[110:390]
text = nltk.Text(tokens)
# concordance() prints its matches itself and returns None, so it is called as
# a plain statement; wrapping it in print would emit a stray "None" line.
text.concordance('gene')